# Description ------------------------------------------------------------------

### This cleans the tax collector survey variables needed for analysis for the
### TAD Impact Evaluation Project, merges in treatment status, then saves cleaned
### version as .Rdata and .dta files

# Packages ---------------------------------------------------------------------
#if packages are not installed, they must be installed with:
# install.packages("package_name"); The name of package must be in quotes
library(dplyr)
library(readr)
library(stringr)
library(purrr)
library(haven)
library(labelled)
library(hms)

# Convenience Functions --------------------------------------------------------
source("scripts/0_functions/functions_cleaning.R")

# Cleaning ---------------------------------------------------------------------

# Import uncleaned tax collector data
tax_collector_end <- read_dta('data/1_raw/tad_endline_tax_collector_nopii.dta')

## Fixing market labels
# goal is to import labels from vendor survey; but there is one number diff
tax_collector_end$market <- remove_val_labels(tax_collector_end$market)
tax_collector_end[tax_collector_end$market == 312, 'market'] <- 23

# replace labels from tax collector survey with those from vendor survey
val_labels(tax_collector_end$market) <- val_labels(read_dta('data/1_raw/tad_endline_market_long_nopii.dta')$market)

# checking to see which markets are missing from tax collector
unique(to_factor(read_dta('data/1_raw/tad_endline_market_long_nopii.dta')$market))[
  unique(to_factor(read_dta('data/1_raw/tad_endline_market_long_nopii.dta')$market)) %nin% 
    unique(to_factor(tax_collector_end$market))]

#Mankhaka and Mpala

#removing unnecessary markets from labels
not_used <- with(tax_collector_end, 
                 val_labels(market)[!(names(val_labels(market)) %in%
                                        unique(to_factor(market)))])
tax_collector_end <- tax_collector_end %>% remove_value_labels(market = not_used)

## Fixing district labels

#remove pilot label from district data
val_label(tax_collector_end$district, 9) <- NULL

## Cleaning TD Outcome Variables

#e10, e11
# function to asssist
to_hours <- function(x){
  time <- as.numeric(x[1]) 
  if(x[4] == "PM" & x[1] != '12')
    time <- time + 12
  if(x[4] == "AM" & x[1] == "12")
    time <- 24
  time <- time + as.numeric(x[2])/60
  time
} 

start_time <- str_split(tax_collector_end$e10, pattern = ":| ")
start_time <- map_dbl(start_time, .f = to_hours)

end_time <- str_split(tax_collector_end$e11, pattern = ":| ")
end_time <- map_dbl(end_time, .f = to_hours)

#outcome questions
tax_collector_end <- mutate(tax_collector_end,
                list_treat = ifelse(!is.na(list_treatment) & 
                                      is.na(list_control), 
                                    1, 
                                    ifelse(!is.na(list_control) &
                                             is.na(list_treatment),
                                           0, NA)),
                list_outcome = ifelse(!is.na(list_treatment) & 
                                        is.na(list_control) &
                                        list_treatment >= 0, 
                                      list_treatment, 
                                      ifelse(!is.na(list_control) &
                                               is.na(list_treatment) &
                                               list_control >= 0,
                                             list_control, NA)),
                hrs_in_mkt = end_time - start_time,
                hrs_in_mkt_fix = ifelse(hrs_in_mkt < 0, NA, hrs_in_mkt),
                vendors_visited = neg_to_na(e12),
                vendors_visited_trim_99 = 
                  neg_to_na(vendors_visited, outlier_top = 
                              quantile(vendors_visited, probs = .99,
                                       na.rm = T)),
                vendors_visited_top_99 = ifelse(vendors_visited > 
                                                  quantile(vendors_visited,
                                                                  probs = .99,
                                                                  na.rm = T),
                                                quantile(vendors_visited,
                                                         probs = .99,
                                                         na.rm = T),
                                                vendors_visited),
                how_paid = neg_to_na(e15),
                change_in_how_paid = rev_values(neg_to_na(e16)) - 1)

change_pay_lab <- c("No" = 0, 
                    "Yes, stopped getting bonuses/commission/incentives" = 1, 
                    "Yes, went from fixed salary to getting a bonus, commisssion, or incentive" = 2)
          
val_labels(tax_collector_end) <- list(change_in_how_paid = change_pay_lab)


#demographic variables
tax_collector_end <- tax_collector_end %>% 
  mutate(gender = neg_to_na(r1),
         female = ifelse(gender == 1, 1, 0),
         age = neg_to_na(d8, outlier_top = 200),
         literacy = rev_values(neg_to_na(d14)),
         literacy_high = ifelse(literacy == 4, 1, 0),
         literacy_any = ifelse(literacy == 3 | literacy == 4, 1, 0),
         education = ifelse(d12 == 888, 0, neg_to_na(d12) + 1),
         marital_status = neg_to_na(d6),
         married = ifelse(marital_status == 2 | 
                            marital_status == 3, 1, 0),
         spk_english = rev_values(neg_to_na(d16)),
         no_english = ifelse(spk_english == 1, 1, 0),
         days_wrk_mkt = neg_to_na(e9),
         mnthly_earn = neg_to_na(e20),
         hh_income = neg_to_na(e25))

education_lab <- c(None = 0, "Nursery School" = 1,
              "Standard 1" = 2, "Standard 2" = 3,
              "Standard 3" = 4, "Standard 4" = 5,
              "Standard 5" = 6, "Standard 6" = 7,
              "Standard 7" = 8, "Standard 8" = 9,
              "Form 1" = 10, "JCE/Form 2" = 11,
              "Form 3" = 12, "MSCE/Form 4" = 13,
              "Technical/Private College (non-Degree)" = 16,
              Degree = 17, Masters = 18, PhD = 19)
literacy_lab <- c("Could not read" = 1, 
             "Could read some of the card" = 2,
             "Could read the whole card with difficulty" = 3,
             "Could read the whole card with ease" = 4)

#NA values labels that have to be removed for factor purposes
na_vec <- c(-66, -77, -88, -99)

#removing NA values for factor purposes
tax_collector_end <- tax_collector_end %>% remove_NA_labels(na_vec, start = 336,
                                                            end = 356)

val_labels(tax_collector_end) <- list(education = education_lab,
                                      literacy = literacy_lab)

# spillover questions
tax_collector_end <- tax_collector_end %>% 
  mutate(collect_othr_mkts = neg_to_na(e18),
         jobs_othr_mkts = neg_to_na(e6),
         othrs_ask_transfer_away = neg_to_na(oa1),
         othrs_succ_transfer_away = neg_to_na(oa2),
         othrs_ask_transfer_here = neg_to_na(oa4),
         othrs_succ_transfer_here = neg_to_na(oa5),
         transfer_btwn_mkts = neg_to_na(oa7),
         ask_transfer_better_pay = neg_to_na(oa9_1),
         ask_transfer_job_easier = neg_to_na(oa9_2),
         ask_transfer_personal_reason = neg_to_na(oa9_3),
         transferred_promotion = neg_to_na(oa9_b_1),
         transferred_mkt_needed_staff = neg_to_na(oa9_b_2),
         transferred_unknown = neg_to_na(oa9_b_3),
         compl_surv_othr_mkt = neg_to_na(final1),
         final2_clean = neg_to_na(final2),
         date_transfer = oa8,
         date_othr_surv = final2a)

#removing NA values for factor purposes
tax_collector_end <- tax_collector_end %>% remove_NA_labels(na_vec, start = 359,
                                                            end = 373)

#compliance questions
tax_collector_end <- tax_collector_end %>% 
  mutate(meeting_held = ifelse(neg_to_na(te1) == 1, 1, 0),
         meeting_attended = ifelse(neg_to_na(te2) == 1, 1, 0),
         meeting_disc_dev = te3_1,
         meeting_disc_pay_fees = te3_2,
         meeting_disc_corrup = te3_3,
         meeting_disc_use_fees = te3_4,
         meeting_disc_sell_outside = te3_5,
         ms_disc_toilets = te3_ms_1,
         ms_disc_water = te3_ms_2,
         ms_disc_security = te3_ms_3,
         ms_disc_roofs_stalls = te3_ms_4,
         ms_disc_paths = te3_ms_5,
         ms_disc_trash = te3_ms_6,
         ms_disc_electricity = te3_ms_7,
         rev_info_system = rev_values(neg_to_na(te8)), #te label (- 1 when num)
         how_acc_info = rev_values(neg_to_na(te10)), #accurate label
         info_source_sms = te9_1,
         info_source_poster = te9_2,
         info_source_told = te9_3,
         info_source_mkt_com = te9_4,
         info_source_mkt_mang_fc = te9_5,
         sms_system = neg_to_na(te9a),
         any_constr = ifelse(neg_to_na(te5) == 1, 1, 0),
         constr_toilets = te6_1,
         constr_water = te6_2,
         constr_security = te6_3,
         constr_roofs_stalls = te6_4,
         constr_paths = te6_5,
         constr_trash = te6_6,
         constr_electricity = te6_7,
         who_resp_constr_mkt_mang = te7_1,
         who_resp_constr_wc = te7_2,
         who_resp_constr_dg = te7_3,
         who_resp_constr_mp = te7_4,
         who_resp_constr_USAID = te7_5,
         who_resp_constr_vendors = te7_6,
         how_paid = neg_to_na(e15), #paid label
         paid_incentives = ifelse(how_paid == 2, 1, 0),
         paid_all_commision = ifelse(how_paid == 4, 1, 0),
         monthly_earn = e17,
         what_happens_to_fees = neg_to_na(oa16), #fees_go label
         fees_dir_mobile_mon = ifelse(what_happens_to_fees == 3, 1, 0),
         how_sup_transf = neg_to_na(oa17), #transfer label
         mobile_money = ifelse(how_sup_transf == 1, 1, 0),
         how_oft_sup_transf = rev_values(neg_to_na(oa18)), #freq_transfer label
         vendor_counting = ifelse(te11 == 1, 1, 0),
         count_to_set_targets = te11a_1,
         count_to_help_dist_wserv = te11a_2,
         how_often_talk_counter = rev_values(neg_to_na(te11b)), #counter_talk label
         talked_to_counter = ifelse(how_often_talk_counter > 1, 1, 0),
         how_often_counter_come = neg_to_na(te11c), #counter_freq label
         who_sent_counter = neg_to_na(te12), #counter label
         how_often_sup_visit = rev_values(neg_to_na(oa11)) #zonevisit label
         )

#removing NA values for factor purposes
tax_collector_end <- tax_collector_end %>% remove_NA_labels(na_vec, start = 376)

# relabel
te13_label <- c("No" = 1,
                "Yes - can find out how spent only" = 2,
                "Yes - can find out amount collected only" = 3,
                "Yes - find out both amount collect and how it was spent" = 4)  

accurate_label <- c("Very inaccurate" = 1, 
                    "Somewhat inaccurate" = 2,
                    "Somewhat accurate" = 3, 
                    "Very accurate" = 4)

fees_go_lab <- c("Bring them to the market manager" = 1,
                 "Bring them to the district revenue official" = 2,
                 "Give them to a mobile money vendor" = 3,
                 "Keep them until supervisor visits market to collect them (everyday)" = 4,
                 "Keep them until supervisor visits market to collect them (several days at once)" = 5,
                 "Deposit them at the bank" = 6)

paid_label <- c("A fixed salary, no matter how much you collect in fees" = 1,
                "A fixed salary, plus incentives if you reach certain targets" = 2,
                "A fixed salary, plus a certain percentage of what you collect" = 3,
                "No fixed salary, it just depends on how much you collect" = 4)

transfer_lab <- c("Via mobile money" = 1,
                  "Goes in person" = 2,
                  "Gives it to their supervisor who take the money in person" = 3,
                  "Deposit them at the bank" = 4)

freq_trans_lab <- c("Once a year" = 1,
                    "Once every few months" = 2,
                    "Once a month" = 3,
                    "A few times a month (2-3 times per month)" = 4,
                    "Once a week" = 5,
                    "Every 5-6 days" = 6,
                    "Every 3-4 days" = 7,
                    "Every other day" = 8,
                    "Every Day" = 9)

counter_talk_lab <- c("No, never" = 1,
                      "Yes, sometimes" = 2, 
                      "Yes, every time they come" = 3)

counter_freq_lab <- c("Once a year" = 1,
                      "A few times a year (2-4 times)" = 2,
                      "At least once a month" = 3,
                      "More than once a month" = 4)

counter_lab <- c("District government" = 1,
                 "National government" = 2)

zonevisit_lab <- c("Never (in the past year)" = 1,
                   "Once in the past year" = 2,
                   "At least twice per year" = 3,
                   "At least 4 times in the past year" = 4,
                   "At least once per month" = 5,
                   "At least twice per month" = 6,
                   "Once a week or more" = 7,
                   "Daily" = 8)
                     
val_labels(tax_collector_end) <- list(rev_info_system = te13_label,
                                      how_acc_info = accurate_label,
                                      how_paid = paid_label,
                                      what_happens_to_fees = fees_go_lab,
                                      how_sup_transf = transfer_lab,
                                      how_oft_sup_transf = freq_trans_lab,
                                      how_often_counter_come = counter_freq_lab,
                                      how_often_talk_counter = counter_talk_lab,
                                      who_sent_counter = counter_lab,
                                      how_often_sup_visit = zonevisit_lab)


# Post-Cleaning Tasks ----------------------------------------------------------

#convert labelled integers to factors
tax_collector_end <- mutate_if(tax_collector_end, is.labelled, to_factor)
head(tax_collector_end)

#merge in treatment assignment
tax_collector_end <- left_join(tax_collector_end, 
                               read_csv('data/2_clean/treatment_groups.csv'),
                        by = c("market" = "Market", "district" = "District"))

#create BU and TD treatment indicators
tax_collector_end <- mutate(tax_collector_end,
                            BU_treat = str_detect(treatment_status, "BU|BOTH") %>% 
                              as.numeric(),
                            TD_treat = str_detect(treatment_status, "TD|BOTH") %>% 
                              as.numeric(),
                            BU = ifelse(BU_treat == 1 & TD_treat == 0,
                                        1, 0),
                            TD = ifelse(BU_treat == 0 & TD_treat == 1,
                                        1, 0),
                            Both = ifelse(BU_treat == 1 & TD_treat == 1,
                                          1, 0))

#merge in treatment assignment block ids
tax_collector_end <- left_join(tax_collector_end, 
                               read_dta("data/1_raw/block_ids.dta") %>% 
                          mutate_if(is.labelled, to_factor),
                        by = c("market" = "market", "district" = "district"))

#also turn some factors into numeric
tax_collector_end <- tax_collector_end %>% 
  mutate(how_acc_info_num = as.numeric(how_acc_info),
         how_paid_num = as.numeric(how_paid),
         how_oft_sup_transf_num = as.numeric(how_oft_sup_transf),
         how_often_counter_come_num = as.numeric(how_often_counter_come),
         how_often_talk_counter_num = as.numeric(how_often_talk_counter),
         how_often_sup_visit_num = as.numeric(how_often_sup_visit),
         rev_info_system_dich = ifelse(rev_info_system == "No", 0, 1))

#making education numberic
tax_collector_end$educ_num <- as.numeric(tax_collector_end$education) - 1
tax_collector_end$educ_none <- 1*(tax_collector_end$educ_num == 0)
tax_collector_end <- tax_collector_end %>% 
  mutate(educ_cat = ifelse(educ_num > 0 & educ_num <= 9, 1,
                           ifelse(educ_num > 9 & educ_num <= 13, 2,
                                  ifelse(educ_num > 13, 3, educ_num))))

# Saving -----------------------------------------------------------------------
#save as RData file
save(tax_collector_end, file = "data/2_clean/tax_collector_end.RData")
